import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples
from sklearn.preprocessing import MinMaxScaler
import plotly.express as px
import plotly.graph_objects as go
def _load_all_csv(data_dir, skip_dirs=()):
    """Read every ``all.csv`` found under *data_dir* into one DataFrame.

    The directory tree is walked with ``os.walk``; each ``all.csv`` is read
    with ``pd.read_csv`` and all frames are concatenated once at the end
    (rows stacked, index renumbered from 0).  The ``date`` column of the
    combined frame is converted with ``pd.to_datetime``.

    Args:
        data_dir: Root directory to search.
        skip_dirs: Directory names (last path component) whose ``all.csv``
            is ignored.  Sub-directories of a skipped directory are still
            searched.

    Returns:
        The combined DataFrame.

    Raises:
        FileNotFoundError: If no ``all.csv`` exists under *data_dir*.
        KeyError: If the combined data has no ``date`` column.
    """
    frames = []
    total_rows = 0
    seen_columns = set()
    for dirname, _, filenames in os.walk(data_dir):
        if os.path.basename(dirname) in skip_dirs:
            continue
        if 'all.csv' not in filenames:
            continue
        path = os.path.join(dirname, 'all.csv')
        tmp = pd.read_csv(path)
        frames.append(tmp)
        # Track the running shape so the progress line matches what a
        # per-file concat would report (rows add up, columns are a union).
        total_rows += len(tmp)
        seen_columns.update(tmp.columns)
        print(path, tmp.shape, (total_rows, len(seen_columns)))
    if not frames:
        raise FileNotFoundError(f"no 'all.csv' found under {data_dir!r}")
    # Concatenate once: growing a frame inside the loop re-copies all rows
    # loaded so far on every iteration.
    combined = pd.concat(frames, axis=0, ignore_index=True)
    combined['date'] = pd.to_datetime(combined['date'])
    return combined


# load us data
data_dir_us = '../preprocess/data/con/us'
# us vs jp ver.: pass skip_dirs=('2018', '2019') to leave out the files that
# sit directly in directories named '2018' and '2019'.
df_us = _load_all_csv(data_dir_us)

# load jp data
data_dir_jp = '../preprocess/data/con/jp'
df_jp = _load_all_csv(data_dir_jp)
# normalization, data preprocessing
#
# Accepted [lower, upper] bounds for each feature column.  normalize() drops
# rows that fall outside these bounds and divides the column by the width of
# the interval.
d_xlim = {
    # features bounded to the unit interval
    'danceability': [0, 1],
    'energy': [0, 1],
    # the only feature here with a negative lower bound
    'loudness': [-20, 0],
    'speechiness': [0, 1],
    'acousticness': [0, 1],
    'liveness': [0, 1],
    'valence': [0, 1],
    # features with wider ranges
    'tempo': [0, 200],
    'duration_ms': [0, 4e5],
}
def normalize(df, limits=None):
    """Drop out-of-range rows, then scale each limited column by its range width.

    A row is kept only if, for every feature in *limits*, its value lies
    inside the inclusive ``[lower, upper]`` interval (rows with NaN in any
    limited feature are dropped, since the comparisons are False for NaN).
    Each limited column is then divided by ``abs(upper - lower)``.

    NOTE: the lower bound is not subtracted, so a feature whose interval
    does not start at 0 is not mapped onto [0, 1] (e.g. ``[-20, 0]`` ends up
    in ``[-1, 0]``).

    The frame shape is printed before and after filtering.  The input frame
    is not modified; a new frame is returned.

    Args:
        df: DataFrame containing every column named in *limits*.
        limits: Mapping ``feature -> [lower, upper]``.  Defaults to the
            module-level ``d_xlim``.

    Returns:
        A filtered, rescaled copy of *df* (original index labels kept).

    Raises:
        ValueError: If an interval has zero width (division by zero).
        KeyError: If a feature named in *limits* is not a column of *df*.
    """
    if limits is None:
        limits = d_xlim
    print(df.shape)

    # Build one positional mask instead of re-filtering the frame once per
    # feature; a NumPy mask also avoids any index alignment.
    keep = np.ones(len(df), dtype=bool)
    widths = {}
    for feature, bounds in limits.items():
        lower, upper = bounds[0], bounds[1]
        if upper == lower:
            raise ValueError(f"zero-width range for feature {feature!r}")
        widths[feature] = np.absolute(upper - lower)
        keep &= ((df[feature] <= upper) & (df[feature] >= lower)).to_numpy()

    # Explicit copy: the column assignments below must not target a slice of
    # the caller's frame (chained-assignment / SettingWithCopyWarning).
    out = df.loc[keep].copy()
    print(out.shape)

    for feature, width in widths.items():
        out[feature] = out[feature] / width
    return out
# Normalize a copy of the US data so that df_us itself stays untouched.
df_norm_us = normalize(df_us.copy())
df_norm = df_norm_us.copy()

# Columns that are left out of the feature matrix.
remove_cols = [
    'title', 'rank', 'date', 'artist', 'url', 'region', 'chart',
    'streams', 'year', 'month', 'mode', 'key', 'instrumentalness',
    'time_signature',
]
# Every remaining column, in its original order.
use_cols = df_norm.columns[~df_norm.columns.isin(remove_cols)].tolist()

# X_fit holds one row per record; X holds only the distinct feature rows.
X_fit = df_norm.loc[:, use_cols].to_numpy()
X = df_norm.loc[:, use_cols].drop_duplicates().to_numpy()
# Feature visualization: pairwise scatter matrix of the selected columns,
# used to check whether any features are highly correlated.
fig = px.scatter_matrix(
    df_norm[use_cols],
    width=1200,
    height=1600,
)
fig.show()